{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1755dad8-dfb5-4595-8cb5-c8af66d136e2",
   "metadata": {},
   "source": [
    "#### Objective:- map blocks and compute population for 2010 and 2020 census data "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a9e480d8-272f-4f73-875c-e965d2566467",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "import ast,os,random\n",
    "pd.set_option('display.float_format','{:.1f}'.format)\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import cudf, cupy as cp\n",
    "import numpy as np\n",
    "import time\n",
    "# pd.set_option('display.max_colwidth', -1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "043ed6e6-d052-4eac-bf14-c809aa8bcc46",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a35321e8-39e1-4d6e-9fcc-16900937636e",
   "metadata": {},
   "outputs": [],
   "source": [
    "full_2020_path='data/nhgis0007_csv/nhgis0007_ds248_2020_block.csv'\n",
    "full_2010_path='data/nhgis0001_csv/nhgis0001_ds172_2010_block.csv'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e9bcdef-ca9d-488b-9f07-f0ecf02c37db",
   "metadata": {},
   "source": [
    "#### Prepare 2010 data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e39f2b9c-0cf9-491e-92f6-7c790657fa71",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "full = cudf.read_csv(full_2010_path, usecols=['GISJOIN', 'H7V001' ,'STATEA'],dtype={'GISJOIN':'str','H7V001':'str','STATEA':'int'})\n",
    "# full.STATEA = full.STATEA.astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "bb88274e-64c3-4c94-8a92-aeeb80e8f254",
   "metadata": {},
   "outputs": [],
   "source": [
    "# small = full[ (full.STATEA ==11) | (full.STATEA ==10)] \n",
    "# data10 = small.copy()\n",
    "data10 = full.copy()\n",
    "\n",
    "del(full)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "566ddf39-691f-4920-8fd6-23e16537f622",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GISJOIN</th>\n",
       "      <th>STATE</th>\n",
       "      <th>P10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>01000100201001000</td>\n",
       "      <td>1</td>\n",
       "      <td>61</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>01000100201001001</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>01000100201001002</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>01000100201001003</td>\n",
       "      <td>1</td>\n",
       "      <td>75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>01000100201001004</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             GISJOIN  STATE P10\n",
       "0  01000100201001000      1  61\n",
       "1  01000100201001001      1   0\n",
       "2  01000100201001002      1   0\n",
       "3  01000100201001003      1  75\n",
       "4  01000100201001004      1   0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data10.GISJOIN = data10.GISJOIN.str.replace('G', '')\n",
    "data10.rename(columns={'H7V001':'P10','STATEA':'STATE'},inplace=True)\n",
    "data10.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "47bb172c-26fe-4ca1-85dd-6113deee11be",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11155486"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a8c0227e-c40c-4af1-a529-f7310e181162",
   "metadata": {},
   "outputs": [],
   "source": [
    "data10['ID10'] = (data10.GISJOIN.str.slice(start=0,stop=2) + data10.GISJOIN.str.slice(start=3,stop=6) + data10.GISJOIN.str.slice(start=7)).astype('int64')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0c814443-ddec-439e-bd89-e377f2362b5b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GISJOIN</th>\n",
       "      <th>STATE</th>\n",
       "      <th>P10</th>\n",
       "      <th>ID10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>01000100201001000</td>\n",
       "      <td>1</td>\n",
       "      <td>61</td>\n",
       "      <td>10010201001000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>01000100201001001</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>10010201001001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>01000100201001002</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>10010201001002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>01000100201001003</td>\n",
       "      <td>1</td>\n",
       "      <td>75</td>\n",
       "      <td>10010201001003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>01000100201001004</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>10010201001004</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             GISJOIN  STATE P10            ID10\n",
       "0  01000100201001000      1  61  10010201001000\n",
       "1  01000100201001001      1   0  10010201001001\n",
       "2  01000100201001002      1   0  10010201001002\n",
       "3  01000100201001003      1  75  10010201001003\n",
       "4  01000100201001004      1   0  10010201001004"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data10.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd85c23e-436c-432a-820d-601c1ddc9324",
   "metadata": {},
   "source": [
    "#### Prepare 2020 data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8b9a9d81-30fe-43b3-87ca-40dbfef6085c",
   "metadata": {},
   "outputs": [],
   "source": [
    "full = cudf.read_csv(full_2020_path,usecols=['GEOCODE','GISJOIN','STATEA','COUNTY','U7B001'],dtype={'GEOCODE':'int64','COUNTY':'str','GISJOIN':'str','U7B001':'int32','STATEA':'int32'})\n",
    "# full.STATEA = full.STATEA.astype('int')\n",
    "# small = full[ (full.STATEA ==11) | (full.STATEA ==10)] \n",
    "# data20 = cudf.from_pandas(small)\n",
    "data20 = full.copy()\n",
    "del(full)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "96b4f00a-3a00-4c26-8822-a909b8f2c0f8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "42    12463\n",
       "39     9932\n",
       "24     9513\n",
       "48     6920\n",
       "1      4793\n",
       "20     3150\n",
       "47     2954\n",
       "17     2846\n",
       "36     2562\n",
       "51     1832\n",
       "29     1787\n",
       "18     1705\n",
       "37     1460\n",
       "19     1385\n",
       "5      1041\n",
       "28      796\n",
       "13      740\n",
       "21      645\n",
       "Name: STATEA, dtype: int32"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data20[data20.COUNTY=='Montgomery County'].STATEA.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "cf0bf0a4-dd54-4625-b20a-1752e84aabbd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GISJOIN</th>\n",
       "      <th>ID20</th>\n",
       "      <th>STATE</th>\n",
       "      <th>COUNTY</th>\n",
       "      <th>P20</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1000100201001000</td>\n",
       "      <td>10010201001000</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1000100201001001</td>\n",
       "      <td>10010201001001</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1000100201001002</td>\n",
       "      <td>10010201001002</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1000100201001003</td>\n",
       "      <td>10010201001003</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1000100201001004</td>\n",
       "      <td>10010201001004</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            GISJOIN            ID20  STATE          COUNTY  P20\n",
       "0  1000100201001000  10010201001000      1  Autauga County   21\n",
       "1  1000100201001001  10010201001001      1  Autauga County   34\n",
       "2  1000100201001002  10010201001002      1  Autauga County   29\n",
       "3  1000100201001003  10010201001003      1  Autauga County   17\n",
       "4  1000100201001004  10010201001004      1  Autauga County    0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# df = small.copy()\n",
    "data20.rename(columns={'U7B001':'P20','GEOCODE':'ID20','STATEA':'STATE'},inplace=True)\n",
    "data20.GISJOIN = data20.GISJOIN.str.replace('G', '').astype('int64')\n",
    "data20.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "3c5b29e4-5fd7-4c3c-9982-a89d50e11c80",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8174955"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data20)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de979e52-5821-4448-8d24-a18e207cffe8",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Mapper "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fa404aeb-7e15-4e60-8808-a167036ab838",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### Concat mapper files for states"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "71392b15-993d-4263-98ff-b10c33f5b701",
   "metadata": {},
   "outputs": [],
   "source": [
    "states = {1 :\"AL\",2 :\"AK\",4 :\"AZ\",5 :\"AR\",6 :\"CA\",8 :\"CO\",9 :\"CT\",10:\"DE\",11:\"DC\",12:\"FL\",13:\"GA\",15:\"HI\",\n",
    "          16:\"ID\",17:\"IL\",18:\"IN\",19:\"IA\",20:\"KS\",21:\"KY\",22:\"LA\",23:\"ME\",24:\"MD\",25:\"MA\",26:\"MI\",27:\"MN\",\n",
    "          28:\"MS\",29:\"MO\",30:\"MT\",31:\"NE\",32:\"NV\",33:\"NH\",34:\"NJ\",35:\"NM\",36:\"NY\",37:\"NC\",38:\"ND\",39:\"OH\",\n",
    "          40:\"OK\",41:\"OR\",42:\"PA\",44:\"RI\",45:\"SC\",46:\"SD\",47:\"TN\",48:\"TX\",49:\"UT\",50:\"VT\",51:\"VA\",53:\"WA\",\n",
    "          54:\"WV\",55:\"WI\",56:\"WY\",72:\"PR\"}\n",
    "\n",
    "# states = {11:\"DC\",10:\"DE\"}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f3cff1f3-5ec8-45d0-b831-3d3a092908dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "def concat_states_mapper(state_key_list): \n",
    "    \n",
    "    concat_mapper = cudf.DataFrame()\n",
    "    for i in state_key_list:\n",
    "        if i< 10:\n",
    "            i_str = '0'+str(i)\n",
    "        else:\n",
    "            i_str = str(i)\n",
    "        path = 'data/block_rel_files/tab2010_tab2020_st%s_%s.csv'%(i_str,states[i].lower())\n",
    "        if os.path.isfile(path):    \n",
    "            mapper = cudf.read_csv(path,delimiter='|')\n",
    "            concat_mapper = cudf.concat([concat_mapper,mapper])\n",
    "        else:\n",
    "            print(\"mapper file does not exist\")\n",
    "            continue\n",
    "    return concat_mapper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "900eaf04-28de-480d-8419-bcf789547a87",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>STATE_2010</th>\n",
       "      <th>COUNTY_2010</th>\n",
       "      <th>TRACT_2010</th>\n",
       "      <th>BLK_2010</th>\n",
       "      <th>BLKSF_2010</th>\n",
       "      <th>AREALAND_2010</th>\n",
       "      <th>AREAWATER_2010</th>\n",
       "      <th>BLOCK_PART_FLAG_O</th>\n",
       "      <th>STATE_2020</th>\n",
       "      <th>COUNTY_2020</th>\n",
       "      <th>TRACT_2020</th>\n",
       "      <th>BLK_2020</th>\n",
       "      <th>BLKSF_2020</th>\n",
       "      <th>AREALAND_2020</th>\n",
       "      <th>AREAWATER_2020</th>\n",
       "      <th>BLOCK_PART_FLAG_R</th>\n",
       "      <th>AREALAND_INT</th>\n",
       "      <th>AREAWATER_INT</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>20100</td>\n",
       "      <td>1000</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>482628</td>\n",
       "      <td>0</td>\n",
       "      <td>p</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>20100</td>\n",
       "      <td>1000</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>288702</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>288702</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>20100</td>\n",
       "      <td>1000</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>482628</td>\n",
       "      <td>0</td>\n",
       "      <td>p</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>20100</td>\n",
       "      <td>1001</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>194408</td>\n",
       "      <td>0</td>\n",
       "      <td>p</td>\n",
       "      <td>193926</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   STATE_2010  COUNTY_2010  TRACT_2010  BLK_2010 BLKSF_2010  AREALAND_2010  \\\n",
       "0           1            1       20100      1000       <NA>         482628   \n",
       "1           1            1       20100      1000       <NA>         482628   \n",
       "\n",
       "   AREAWATER_2010 BLOCK_PART_FLAG_O  STATE_2020  COUNTY_2020  TRACT_2020  \\\n",
       "0               0                 p           1            1       20100   \n",
       "1               0                 p           1            1       20100   \n",
       "\n",
       "   BLK_2020 BLKSF_2020  AREALAND_2020  AREAWATER_2020 BLOCK_PART_FLAG_R  \\\n",
       "0      1000       <NA>         288702               0              <NA>   \n",
       "1      1001       <NA>         194408               0                 p   \n",
       "\n",
       "   AREALAND_INT  AREAWATER_INT  \n",
       "0        288702              0  \n",
       "1        193926              0  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapper_df = concat_states_mapper(states.keys())\n",
    "mapper_df.rename(columns={'﻿STATE_2010':'STATE_2010'},inplace=True)\n",
    "mapper_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f8829da8-8174-460f-a841-b841e5121349",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>ID10</th>\n",
       "      <th>ID20</th>\n",
       "      <th>STATE_2020</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>10010201001000</td>\n",
       "      <td>10010201001000</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>10010201001000</td>\n",
       "      <td>10010201001001</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>10010201001001</td>\n",
       "      <td>10010201001001</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>10010201001002</td>\n",
       "      <td>10010201001002</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>10010201001003</td>\n",
       "      <td>10010201001002</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   index            ID10            ID20  STATE_2020\n",
       "0      0  10010201001000  10010201001000           1\n",
       "1      1  10010201001000  10010201001001           1\n",
       "2      2  10010201001001  10010201001001           1\n",
       "3      3  10010201001002  10010201001002           1\n",
       "4      4  10010201001003  10010201001002           1"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mapper_df['ID10'] = (mapper_df.STATE_2010.astype('str').str.rjust(2,'0') + mapper_df.COUNTY_2010.astype('str').str.rjust(3,'0') + mapper_df.TRACT_2010.astype('str').str.rjust(6,'0') + mapper_df.BLK_2010.astype('str').str.rjust(4,'0')).astype('int64')\n",
    "mapper_df['ID20'] = (mapper_df.STATE_2020.astype('str').str.rjust(2,'0') + mapper_df.COUNTY_2020.astype('str').str.rjust(3,'0') + mapper_df.TRACT_2020.astype('str').str.rjust(6,'0') + mapper_df.BLK_2020.astype('str').str.rjust(4,'0')).astype('int64')\n",
    "mapper_df = mapper_df[['ID10','ID20','STATE_2020']].reset_index()\n",
    "mapper_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02e10ddb-4a2b-4345-b927-922f534796f8",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### Create Mapped IDs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4bb036f4-c193-4a8f-8593-0beba54e56e0",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### Single Mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "13041ee2-4f3d-46c8-9ccf-0a937d748651",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID10</th>\n",
       "      <th>P10</th>\n",
       "      <th>freq</th>\n",
       "      <th>weights</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7392</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>61.0</td>\n",
       "      <td>2</td>\n",
       "      <td>30.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7393</th>\n",
       "      <td>10010201001001</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7394</th>\n",
       "      <td>10010201001002</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7395</th>\n",
       "      <td>10010201001003</td>\n",
       "      <td>75.0</td>\n",
       "      <td>2</td>\n",
       "      <td>37.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7396</th>\n",
       "      <td>10010201001004</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                ID10  P10  freq  weights\n",
       "7392  10010201001000 61.0     2     30.5\n",
       "7393  10010201001001  0.0     1      0.0\n",
       "7394  10010201001002  0.0     1      0.0\n",
       "7395  10010201001003 75.0     2     37.5\n",
       "7396  10010201001004  0.0     1      0.0"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "freq_df = mapper_df.ID10.value_counts().reset_index().sort_values('index')\n",
    "freq_df.rename(columns={'ID10':'freq'},inplace=True)\n",
    "freq_df.rename(columns={'index':'ID10'},inplace=True)\n",
    "freq_df = freq_df.reset_index(drop=True)\n",
    "data10.P10 = data10.P10.astype('float32')\n",
    "freq_df = cudf.merge(freq_df,data10[['ID10','P10']],on='ID10',how='right').sort_values('ID10')\n",
    "freq_df['weights'] = freq_df['P10'].divide(freq_df['freq'])\n",
    "freq_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "e2441ad2-b699-4db2-b7a3-871011b70822",
   "metadata": {},
   "outputs": [],
   "source": [
    "weighted_mapper = cudf.merge(mapper_df,freq_df[['ID10','weights']],on='ID10',how='left').sort_values('ID20').reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "0ef7b76d-8022-4ab2-9191-5fc21e857ec7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID20</th>\n",
       "      <th>weights</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>30.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10010201001001</td>\n",
       "      <td>30.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10010201001002</td>\n",
       "      <td>51.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10010201001003</td>\n",
       "      <td>13.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10010201001004</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID20  weights\n",
       "0  10010201001000     30.5\n",
       "1  10010201001001     30.5\n",
       "2  10010201001002     51.8\n",
       "3  10010201001003     13.3\n",
       "4  10010201001004      0.0"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eq_10 = weighted_mapper.groupby('ID20')['weights'].sum().reset_index().sort_values('ID20').reset_index(drop=True) \n",
    "eq_10.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "077192cb-70c8-44fa-a77d-e501fb2159cc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>ID10</th>\n",
       "      <th>ID20</th>\n",
       "      <th>STATE_2020</th>\n",
       "      <th>weights</th>\n",
       "      <th>eq_P10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>10010201001000</td>\n",
       "      <td>10010201001000</td>\n",
       "      <td>1</td>\n",
       "      <td>30.5</td>\n",
       "      <td>30.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>10010201001000</td>\n",
       "      <td>10010201001001</td>\n",
       "      <td>1</td>\n",
       "      <td>30.5</td>\n",
       "      <td>30.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>10010201001001</td>\n",
       "      <td>10010201001001</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>51.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>10010201001002</td>\n",
       "      <td>10010201001002</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>13.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>10010201001003</td>\n",
       "      <td>10010201001002</td>\n",
       "      <td>1</td>\n",
       "      <td>37.5</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   index            ID10            ID20  STATE_2020  weights  eq_P10\n",
       "0      0  10010201001000  10010201001000           1     30.5    30.5\n",
       "1      1  10010201001000  10010201001001           1     30.5    30.5\n",
       "2      2  10010201001001  10010201001001           1      0.0    51.8\n",
       "3      3  10010201001002  10010201001002           1      0.0    13.3\n",
       "4      4  10010201001003  10010201001002           1     37.5     0.0"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "weighted_mapper['eq_P10'] = eq_10['weights']\n",
    "weighted_mapper.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "fc0a9241-7715-457f-b712-f87789e84646",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GISJOIN</th>\n",
       "      <th>ID20</th>\n",
       "      <th>STATE</th>\n",
       "      <th>COUNTY</th>\n",
       "      <th>P20</th>\n",
       "      <th>eq_P10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1000100201001000</td>\n",
       "      <td>10010201001000</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>21</td>\n",
       "      <td>30.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1000100201001001</td>\n",
       "      <td>10010201001001</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>34</td>\n",
       "      <td>30.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1000100201001002</td>\n",
       "      <td>10010201001002</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>29</td>\n",
       "      <td>51.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1000100201001003</td>\n",
       "      <td>10010201001003</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>17</td>\n",
       "      <td>13.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1000100201001004</td>\n",
       "      <td>10010201001004</td>\n",
       "      <td>1</td>\n",
       "      <td>Autauga County</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            GISJOIN            ID20  STATE          COUNTY  P20  eq_P10\n",
       "0  1000100201001000  10010201001000      1  Autauga County   21    30.5\n",
       "1  1000100201001001  10010201001001      1  Autauga County   34    30.5\n",
       "2  1000100201001002  10010201001002      1  Autauga County   29    51.8\n",
       "3  1000100201001003  10010201001003      1  Autauga County   17    13.3\n",
       "4  1000100201001004  10010201001004      1  Autauga County    0     0.0"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data20['eq_P10'] = weighted_mapper['eq_P10'].copy()\n",
    "data20.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "dbfc11a9-6956-4dea-9a4a-bed7cf56b552",
   "metadata": {},
   "outputs": [],
   "source": [
    "# data20.to_csv('data/mapped_blocks_full.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
